

********************************************************************************
*
* This is the Stata do-file we ran to create the datasets used to make Figures
* 1, 2, and 3 in the main paper. The datasets built here are passed to the R
* script analysis_MainPaper.R, which finalizes the figures.
*    
********************************************************************************


// Move to the project root. The path is a placeholder; -capture- lets the
// script continue from the current directory if the cd fails.
capture cd "YOUR WORKING DIRECTORY HERE"

// Read in the patent-level file (patent_gbd_level.dta), replacing any data
// currently in memory.
use "clean_data/patent_data/patent_gbd_level.dta", clear

// Discard every existing variable whose name ends in "tag"; a fresh patent
// tag is built below.
drop *tag

// Flag exactly one row per patent_id; rows with a missing patent_id are
// never flagged.
egen patent_tag = tag(patent_id)
replace patent_tag = 0 if missing(patent_id)

// Reduce the data to one observation per patent.
keep if patent_tag == 1

// 0/1 indicators for selected subcategory_id codes (31, 32, 33, 39).
generate sub_drug          = (subcategory_id == 31)
generate sub_surgery_inst  = (subcategory_id == 32)
generate sub_biotech       = (subcategory_id == 33)
generate sub_miscellaneous = (subcategory_id == 39)

// Patents flagged female but not male, and male but not female.
generate patent_only_female = (patent_female & !patent_male)
generate patent_only_male   = (!patent_female & patent_male)

// Net indicator: +1 for female-only, -1 for male-only, 0 otherwise.
generate patent_net_fm = patent_only_female - patent_only_male

// Attach the per-patent inventor gender counts. Rows that exist only in the
// using file (_merge == 2) are discarded; unmatched master rows are kept.
merge 1:1 patent_id using "clean_data/patent_data/inventor_gender_counts.dta"
drop if _merge == 2
drop _merge

// Indicators for the female count equalling the team size, and equalling the
// number of inventors counted in inventor_gender_count.
generate all_female_team = (female_count == pat_team_sz)
generate all_female      = (female_count == inventor_gender_count)


// Duplicate every observation so three groups can be plotted:
//   (1) all patents (the duplicated copies, coded 2 below),
//   (2) female-majority patents, and (3) male-majority patents.
// expander == 1 marks the added copies.
expand 2, generate(expander)

// Code the duplicated copies as group 2 in the existing pat_female_member
// variable (it is not created in this file; presumably it comes from the
// loaded data -- verify).
replace pat_female_member = 2 if expander == 1


// Majority-female flag. Patents without both gender counts cannot be
// classified, so they are removed first.
drop if female_count == . | male_count == .

// 1 when female inventors are at least as numerous as male inventors
// (ties are coded 1), 0 otherwise; duplicated copies are coded 2.
generate pat_majority_female = (female_count >= male_count)
replace pat_majority_female = 2 if expander == 1


// Two-year bins: round the year down to an even year, fold 2010 into the
// 2008 bin, then shift by one so each bin is labelled by its odd year.
generate patent_year2 = patent_year - mod(patent_year, 2)
replace patent_year2 = cond(patent_year2 == 2010, 2008, patent_year2) + 1

// Graph styling via grstyle, which is not a built-in Stata command and must
// already be installed for these two lines to run.
grstyle init plain, replace
grstyle set plain, horizontal nogrid

// Copies of the indicators multiplied by 100.
generate pat_majority_female100 = 100 * pat_majority_female
generate patent_female100       = 100 * patent_female
generate patent_male100         = 100 * patent_male
generate patent_net_fm100       = 100 * patent_net_fm


// Build the collapsed datasets that are exported as CSV and plotted in R for
// the final version of the paper.
//
// First export: yearly sums of patent_female, patent_male and the number of
// patents, split by majority group. The duplicated "all patents" copies
// (pat_majority_female == 2) are excluded. The data in memory are restored
// afterwards.
preserve
	generate patent_total = 1
	drop if pat_majority_female == 2
	collapse (sum) patent_female patent_male patent_total, ///
		by(patent_year pat_majority_female)
	export delimited using "clean_data/patent_data/inventor_invention_counts.csv", replace
restore

// Decade-level export: sums of the x100 male/female indicators by decade and
// majority group (0, 1, and 2 = all patents). The data in memory are restored
// afterwards.
preserve
	// Decade coding: 1980 is the default for any non-missing year, years
	// after 1989 become 1990, and years after 1999 become 2000.
	//
	// Stata treats a missing numeric value as larger than every number, so
	// an unguarded "patent_year > 1999" is TRUE when patent_year is missing
	// and would silently count those patents in the 2000 decade. The
	// !missing() guards prevent that.
	generate patent_decade = 1980 if !missing(patent_year)
	replace patent_decade = 1990 if patent_year > 1989 & !missing(patent_year)
	replace patent_decade = 2000 if patent_year > 1999 & !missing(patent_year)

	// Report, then exclude, observations with no usable year so they do not
	// contribute to any decade total. With no missing years this drops
	// nothing and the output is unchanged.
	count if missing(patent_decade)
	drop if missing(patent_decade)

	collapse (sum) patent_male100 patent_female100, by(patent_decade pat_majority_female)
	export delimited using "clean_data/patent_data/female_net_sums.csv", replace
restore

